In [1]:
    
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import model_selection  # grid_search and cross_validation were merged into model_selection
import joblib  # sklearn.externals.joblib is deprecated; import joblib directly
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import operator
import itertools
import random
import os
import pickle
import time
    
In [2]:
    
DATA_DIRECTORY = "E:\\eaglesense\\data\\topviewkinect"
PREPROCESSED_DIRECTORY = DATA_DIRECTORY + "\\all"
FEATURE_SET = "eval-chi2"
    
In [3]:
    
if not os.path.exists("results"):
    os.makedirs("results")
    
In [4]:
    
features_csv = "{root}/{tag}_features.csv".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
features_df = pd.read_csv(features_csv)
    
In [5]:
    
features_df.head()
    
    Out[5]:
In [6]:
    
labels_csv = "{root}/{tag}_labels.csv".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
labels_df = pd.read_csv(labels_csv)
    
In [7]:
    
s1_data_path = "{root}/{tag}_s1_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
s2_data_path = "{root}/{tag}_s2_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
cs_data_path = "{root}/{tag}_cs_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
noinfrared_data_path = "{root}/{tag}_cs_noinfrared_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
with open(s1_data_path, "rb") as f:
    s1_data = pickle.load(f)
    
with open(s2_data_path, "rb") as f:
    s2_data = pickle.load(f)
with open(cs_data_path, "rb") as f:
    cs_data = pickle.load(f)
with open(noinfrared_data_path, "rb") as f:
    noinfrared_data = pickle.load(f)
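    
A quick shape check on the loaded splits; this is an added sketch, assuming each pickle holds the four arrays ("X_train", "y_train", "X_test", "y_test") used in the cells below.
    
In [ ]:
    
# Print the array shapes of each preprocessed split.
for split_name, split_data in [("s1", s1_data), ("s2", s2_data), ("cs", cs_data), ("noinfrared", noinfrared_data)]:
    print(split_name, {key: value.shape for key, value in split_data.items()})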
    
In [8]:
    
unique_subjects = features_df["subject"].unique()
unique_subjects
    
    Out[8]:
In [9]:
    
ACTIVITIES = ["Standing", "Sitting", "Pointing", "Phone", "Tablet", "Paper"]
    
In [10]:
    
num_activities = len(ACTIVITIES)
num_activities
    
    Out[10]:
In [11]:
    
XGB_PARAM_FINAL = {}
XGB_PARAM_FINAL["eta"] = 0.3
XGB_PARAM_FINAL["gamma"] = 1
XGB_PARAM_FINAL["lambda"] = 1
XGB_PARAM_FINAL["alpha"] = 0
XGB_PARAM_FINAL["max_depth"] = 6
XGB_PARAM_FINAL["colsample_bytree"] = 0.5
XGB_PARAM_FINAL["colsample_bylevel"] = 0.5
XGB_PARAM_FINAL["subsample"] = 0.5
XGB_PARAM_FINAL["objective"] = "multi:softmax"
XGB_PARAM_FINAL["eval_metric"] = "merror"
XGB_PARAM_FINAL["num_class"] = len(ACTIVITIES)
XGB_PARAM_FINAL["silent"] = 0
XGB_NUM_ROUNDS = 200
XGB_EARLYSTOPPING_ROUNDS = 30
    
In [12]:
    
def crosssubject_test_split(features_df, labels_df, training_subjects_ids):
    num_features = features_df.shape[1] - 1
    
    X_train = np.array([], dtype=np.float64).reshape(0, num_features)
    y_train = np.array([], dtype=np.int32).reshape(0, 1)
    X_test = np.array([], dtype=np.float64).reshape(0, num_features)
    y_test = np.array([], dtype=np.int32).reshape(0, 1)
    for subject_id in features_df["subject"].unique():
        subject_features = features_df[features_df["subject"] == subject_id]
        subject_features = subject_features.drop(["subject"], axis=1)
        subject_labels = labels_df[labels_df["subject"] == subject_id]
        subject_labels = subject_labels[["activity"]]
        subject_X = subject_features.values
        subject_y = subject_labels.values
        if subject_id in training_subjects_ids:
            X_train = np.vstack([X_train, subject_X])
            y_train = np.vstack([y_train, subject_y])
        else:
            X_test = np.vstack([X_test, subject_X])
            y_test = np.vstack([y_test, subject_y])
    
    return X_train, y_train, X_test, y_test
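    
A quick usage sketch of this helper (hypothetical split, not part of the original evaluation): train on the first half of the subject IDs and test on the rest.
    
In [ ]:
    
# Hypothetical example split: first half of the subjects for training, the remaining subjects for testing.
example_train_ids = unique_subjects[:len(unique_subjects) // 2]
example_X_train, example_y_train, example_X_test, example_y_test = crosssubject_test_split(
    features_df, labels_df, example_train_ids)
example_X_train.shape, example_X_test.shape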
    
In [13]:
    
def get_normalized_confusion_matrix(y_true, y_predicted):
    confusion_matrix = metrics.confusion_matrix(y_true, y_predicted)
    confusion_matrix_normalized = confusion_matrix.astype("float") / confusion_matrix.sum(axis=1)[:, np.newaxis]
    confusion_matrix_normalized *= 100
    return confusion_matrix_normalized
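    
Since matplotlib and seaborn are already imported, a matrix returned by this helper can be plotted directly. A minimal sketch (added here for illustration; it assumes the class order matches ACTIVITIES):
    
In [ ]:
    
def plot_confusion_matrix(confusion_matrix_normalized, labels=ACTIVITIES):
    # Heatmap of per-class percentages; rows are true activities, columns are predictions.
    fig, ax = plt.subplots(figsize=(7, 6))
    sns.heatmap(confusion_matrix_normalized, annot=True, fmt=".1f", cmap="Blues",
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel("Predicted activity")
    ax.set_ylabel("True activity")
    plt.show()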
    
In [14]:
    
s1_X_train = s1_data["X_train"]
s1_y_train = s1_data["y_train"]
s1_X_test = s1_data["X_test"]
s1_y_test = s1_data["y_test"]
    
In [15]:
    
s1_X_train.shape
    
    Out[15]:
In [16]:
    
s1_X_test.shape
    
    Out[16]:
In [17]:
    
s1_train_xgbmatrix = xgb.DMatrix(s1_X_train, s1_y_train)
s1_test_xgbmatrix = xgb.DMatrix(s1_X_test, s1_y_test)
s1_watchlist = [(s1_train_xgbmatrix, "train"), (s1_test_xgbmatrix, "eval")]
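    
Optionally, the fixed hyperparameters in XGB_PARAM_FINAL could be sanity-checked with k-fold cross-validation on the training matrix before the hold-out run; a minimal sketch, not part of the original notebook (5 folds chosen arbitrarily).
    
In [ ]:
    
# Sketch: 5-fold CV with early stopping to double-check the chosen number of boosting rounds.
# cv_history = xgb.cv(params=XGB_PARAM_FINAL, dtrain=s1_train_xgbmatrix,
#                     num_boost_round=XGB_NUM_ROUNDS, nfold=5,
#                     early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, seed=42)
# cv_history.tail()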
    
In [18]:
    
s1_eval_results = {}
# Validate with early stopping on the held-out evaluation set to select the number of boosting rounds.
s1_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=s1_train_xgbmatrix, evals=s1_watchlist, evals_result=s1_eval_results,
                          num_boost_round=XGB_NUM_ROUNDS, early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)
    
    
In [19]:
    
# Retrain for the number of rounds chosen by early stopping (the same pattern is repeated for the other splits below).
s1_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=s1_train_xgbmatrix, num_boost_round=s1_validation.best_iteration+1)
    
In [20]:
    
s1_y_predicted = s1_booster.predict(s1_test_xgbmatrix)
    
In [21]:
    
s1_accuracy = metrics.accuracy_score(s1_y_test, s1_y_predicted)
s1_accuracy
    
    Out[21]:
In [22]:
    
s1_confusion_matrix = get_normalized_confusion_matrix(s1_y_test, s1_y_predicted)
    
In [23]:
    
s1_results_dump = {
    "eval_results": s1_eval_results,
    "eval_earlystoppping_best_iteration": s1_validation.best_iteration+1,
    "eval_earlystoppping_best_score": s1_validation.best_score,
    "classifier": s1_booster,
    "final_accuracy": s1_accuracy,
    "final_confusion_matrix": s1_confusion_matrix
}
with open("results/s1.pickle", "wb") as f:
    pickle.dump(s1_results_dump, f)
    
In [24]:
    
s2_X_train = s2_data["X_train"]
s2_y_train = s2_data["y_train"]
s2_X_test = s2_data["X_test"]
s2_y_test = s2_data["y_test"]
    
In [25]:
    
s2_X_train.shape
    
    Out[25]:
In [26]:
    
s2_X_test.shape
    
    Out[26]:
In [27]:
    
s2_train_xgbmatrix = xgb.DMatrix(s2_X_train, s2_y_train)
s2_test_xgbmatrix = xgb.DMatrix(s2_X_test, s2_y_test)
s2_watchlist = [(s2_train_xgbmatrix, "train"), (s2_test_xgbmatrix, "eval")]
    
In [28]:
    
s2_eval_results = {}
s2_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=s2_train_xgbmatrix, evals=s2_watchlist, evals_result=s2_eval_results, 
                          num_boost_round=XGB_NUM_ROUNDS, early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)
    
    
In [29]:
    
s2_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=s2_train_xgbmatrix, num_boost_round=s2_validation.best_iteration+1)
    
In [30]:
    
s2_y_predicted = s2_booster.predict(s2_test_xgbmatrix)
    
In [31]:
    
s2_accuracy = metrics.accuracy_score(s2_y_test, s2_y_predicted)
s2_accuracy
    
    Out[31]:
In [32]:
    
s2_confusion_matrix = get_normalized_confusion_matrix(s2_y_test, s2_y_predicted)
    
In [33]:
    
s2_results_dump = {
    "eval_results": s2_eval_results,
    "eval_earlystoppping_best_iteration": s2_validation.best_iteration+1,
    "eval_earlystoppping_best_score": s2_validation.best_score,
    "classifier": s2_booster,
    "final_accuracy": s2_accuracy,
    "final_confusion_matrix": s2_confusion_matrix
}
with open("results/s2.pickle", "wb") as f:
    pickle.dump(s2_results_dump, f)
    
In [14]:
    
cs_X_train = cs_data["X_train"]
cs_y_train = cs_data["y_train"]
cs_X_test = cs_data["X_test"]
cs_y_test = cs_data["y_test"]
    
In [16]:
    
cs_X_train.shape
    
    Out[16]:
In [17]:
    
cs_X_test.shape
    
    Out[17]:
In [37]:
    
from sklearn import ensemble
    
In [38]:
    
rf_clf = ensemble.RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=None, max_features="sqrt", 
                                         random_state=42, n_jobs=-1)
    
In [39]:
    
rf_training_start = time.time()
rf_clf.fit(cs_X_train, cs_y_train.ravel())
rf_training_time = (time.time() - rf_training_start)
rf_training_time
    
    Out[39]:
In [40]:
    
rf_testing_start = time.time()
rf_y_predicted = rf_clf.predict(cs_X_test)
rf_testing_time = (time.time() - rf_testing_start)
rf_testing_time
    
    Out[40]:
In [41]:
    
rf_y_train_predicted = rf_clf.predict(cs_X_train)
rf_train_accuracy = metrics.accuracy_score(cs_y_train, rf_y_train_predicted)
rf_train_accuracy
    
    Out[41]:
In [42]:
    
rf_accuracy = metrics.accuracy_score(cs_y_test, rf_y_predicted)
rf_accuracy
    
    Out[42]:
In [43]:
    
rf_confusion_matrix = get_normalized_confusion_matrix(cs_y_test, rf_y_predicted)
    
In [44]:
    
rf_results_dump = {
    "training_time": rf_training_time,
    "testing_time": rf_testing_time,
    "training_accuracy": rf_train_accuracy,
    "final_accuracy": rf_accuracy,
    "final_confusion_matrix": rf_confusion_matrix
}
with open("results/cs_rf.pickle", "wb") as f:
    pickle.dump(rf_results_dump, f)
    
In [53]:
    
cs_X_train.shape
    
    Out[53]:
In [54]:
    
cs_X_test.shape
    
    Out[54]:
In [20]:
    
cs_train_xgbmatrix = xgb.DMatrix(cs_X_train, cs_y_train)
cs_test_xgbmatrix = xgb.DMatrix(cs_X_test, cs_y_test)
cs_watchlist = [(cs_train_xgbmatrix, "train"), (cs_test_xgbmatrix, "eval")]
    
In [21]:
    
cs_eval_results = {}
cs_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=cs_train_xgbmatrix, evals=cs_watchlist, evals_result=cs_eval_results,
                          num_boost_round=XGB_NUM_ROUNDS, early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)
    
    
In [22]:
    
xgboost_training_start = time.time()
cs_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=cs_train_xgbmatrix, num_boost_round=cs_validation.best_iteration+1)
xgboost_training_time = (time.time() - xgboost_training_start)
xgboost_training_time
    
    Out[22]:
In [55]:
    
total_time = list()
    
In [ ]:
    
# Per-sample prediction latency over the training-set samples
for i in range(cs_X_train.shape[0]):
    x = cs_X_train[i, :].reshape((1, -1))
    x_dmatrix = xgb.DMatrix(x)
    start = time.time()
    cs_booster.predict(x_dmatrix)
    total_time.append(time.time() - start)
# Per-sample prediction latency over the test-set samples
for i in range(cs_X_test.shape[0]):
    x = cs_X_test[i, :].reshape((1, -1))
    x_dmatrix = xgb.DMatrix(x)
    start = time.time()
    cs_booster.predict(x_dmatrix)
    total_time.append(time.time() - start)
    
In [ ]:
    
avg_time = np.mean(total_time)
    
In [ ]:
    
avg_time * 1000
    
In [ ]:
    
std_time = np.std(total_time)
    
In [ ]:
    
std_time * 1000
    
In [50]:
    
xgboost_testing_start = time.time()
cs_y_predicted = cs_booster.predict(cs_test_xgbmatrix)
xgboost_testing_time = (time.time() - xgboost_testing_start)
xgboost_testing_time
    
    Out[50]:
In [51]:
    
cs_y_train_predicted = cs_booster.predict(cs_train_xgbmatrix)
cs_train_accuracy = metrics.accuracy_score(cs_y_train, cs_y_train_predicted)
cs_train_accuracy
    
    Out[51]:
In [52]:
    
cs_accuracy = metrics.accuracy_score(cs_y_test, cs_y_predicted)
cs_accuracy
    
    Out[52]:
In [53]:
    
cs_confusion_matrix = get_normalized_confusion_matrix(cs_y_test, cs_y_predicted)
    
In [54]:
    
cs_confusion_matrix_subjects = list()
for subject_id in unique_subjects:
    subject_features = features_df[features_df["subject"] == subject_id]
    subject_features = subject_features.drop(["subject"], axis=1)
    subject_labels = labels_df[labels_df["subject"] == subject_id]
    subject_labels = subject_labels[["activity"]]
    subject_X = subject_features.values
    subject_y = subject_labels.values
    subject_xgbmatrix = xgb.DMatrix(subject_X, subject_y)
    subject_y_predicted = cs_booster.predict(subject_xgbmatrix)
    
    subject_accuracy = metrics.accuracy_score(subject_y, subject_y_predicted)
    subject_confusion_matrix = get_normalized_confusion_matrix(subject_y, subject_y_predicted)
    cs_confusion_matrix_subjects.append((subject_id, subject_accuracy, subject_confusion_matrix))
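    
For a quick look at how the cross-subject model generalises to each individual, the tuples collected above can be printed directly; an illustrative sketch, not part of the original output.
    
In [ ]:
    
# Per-subject accuracy of the cross-subject booster (in percent).
for subject_id, subject_accuracy, _ in cs_confusion_matrix_subjects:
    print("Subject", subject_id, "\tAccuracy:", round(subject_accuracy * 100, 2))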
    
In [55]:
    
for activity_idx, activity in enumerate(ACTIVITIES):
    activity_accuracy = cs_confusion_matrix[activity_idx, activity_idx]
    activity_error = 100 - activity_accuracy
    print(activity, "\tAccuracy:", activity_accuracy, "\tError:", activity_error)
    
    
In [56]:
    
cs_results_dump = {
    "training_time": xgboost_training_time,
    "testing_time": xgboost_testing_time,
    "eval_results": cs_eval_results,
    "eval_earlystoppping_best_iteration": cs_validation.best_iteration+1,
    "eval_earlystoppping_best_score": cs_validation.best_score,
    "classifier": cs_booster,
    "training_accuracy": cs_train_accuracy,
    "final_accuracy": cs_accuracy,
    "final_confusion_matrix": cs_confusion_matrix,
    "subject_confusion_matrix": cs_confusion_matrix_subjects
}
with open("results/cs.pickle", "wb") as f:
    pickle.dump(cs_results_dump, f)
    
In [57]:
    
noinfrared_X_train = noinfrared_data["X_train"]
noinfrared_y_train = noinfrared_data["y_train"]
noinfrared_X_test = noinfrared_data["X_test"]
noinfrared_y_test = noinfrared_data["y_test"]
    
In [58]:
    
noinfrared_X_train.shape
    
    Out[58]:
In [59]:
    
noinfrared_X_test.shape
    
    Out[59]:
In [60]:
    
noinfrared_train_xgbmatrix = xgb.DMatrix(noinfrared_X_train, noinfrared_y_train)
noinfrared_test_xgbmatrix = xgb.DMatrix(noinfrared_X_test, noinfrared_y_test)
noinfrared_watchlist = [(noinfrared_train_xgbmatrix, "train"), (noinfrared_test_xgbmatrix, "eval")]
    
In [61]:
    
noinfrared_eval_results = {}
noinfrared_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=noinfrared_train_xgbmatrix, evals=noinfrared_watchlist,
                                  evals_result=noinfrared_eval_results, num_boost_round=XGB_NUM_ROUNDS, 
                                  early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)
    
    
In [62]:
    
noinfrared_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=noinfrared_train_xgbmatrix,
                               num_boost_round=noinfrared_validation.best_iteration+1)
    
In [63]:
    
noinfrared_y_predicted = noinfrared_booster.predict(noinfrared_test_xgbmatrix)
    
In [64]:
    
noinfrared_accuracy = metrics.accuracy_score(noinfrared_y_test, noinfrared_y_predicted)
noinfrared_accuracy
    
    Out[64]:
In [65]:
    
noinfrared_confusion_matrix = get_normalized_confusion_matrix(noinfrared_y_test, noinfrared_y_predicted)
    
In [66]:
    
noinfrared_results_dump = {
    "eval_results": noinfrared_eval_results,
    "eval_earlystoppping_best_iteration": noinfrared_validation.best_iteration+1,
    "eval_earlystoppping_best_score": noinfrared_validation.best_score,
    "classifier": noinfrared_booster,
    "final_accuracy": noinfrared_accuracy,
    "final_confusion_matrix": noinfrared_confusion_matrix,
}
with open("results/cs_noinfrared.pickle", "wb") as f:
    pickle.dump(noinfrared_results_dump, f)
    
In [67]:
    
# All leave-half-out subject splits: every way of choosing half the subjects for training.
cs_combinations = list(itertools.combinations(unique_subjects, int(len(unique_subjects) / 2)))
len(cs_combinations)
    
    Out[67]:
In [68]:
    
cs_combinations_results_csv = "results/cs_combinations.csv"
    
In [69]:
    
# Start a fresh results file containing only the header row.
data_columns = pd.DataFrame(columns=["combination", "activity", "a1", "a2", "a3", "a4", "a5", "a6"])
data_columns.to_csv(cs_combinations_results_csv, header=True, index=False)
    
In [70]:
    
for cs_combination_idx, cs_combination in enumerate(cs_combinations):
    print(cs_combination_idx, "... ", end="")
    # Get data
    combination_X_train, combination_y_train, combination_X_test, combination_y_test = crosssubject_test_split(
        features_df, labels_df, cs_combination)
    combination_train_xgbmatrix = xgb.DMatrix(combination_X_train, combination_y_train)
    combination_test_xgbmatrix = xgb.DMatrix(combination_X_test, combination_y_test)
    # Train for the number of rounds selected earlier on the main cross-subject split
    combination_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=combination_train_xgbmatrix,
                                    num_boost_round=cs_validation.best_iteration+1)
    combination_y_predicted = combination_booster.predict(combination_test_xgbmatrix)
    
    # Raw-count confusion matrix for this split
    combination_results = metrics.confusion_matrix(combination_y_test, combination_y_predicted)
    combination_results_df = pd.DataFrame(columns=["combination", "activity", "a1", "a2", "a3", "a4", "a5", "a6"])
    for activity_id, activity in enumerate(ACTIVITIES):
        combination_results_df.loc[activity_id] = [
            cs_combination_idx, activity, 
            combination_results[activity_id,0], combination_results[activity_id,1], combination_results[activity_id,2], 
            combination_results[activity_id,3], combination_results[activity_id,4], combination_results[activity_id,5]
        ]
    # Append results
    with open(cs_combinations_results_csv, "a") as f:
        combination_results_df.to_csv(f, header=False, index=False)
    
    
In [71]:
    
combinations_results_df = pd.read_csv(cs_combinations_results_csv)
    
In [72]:
    
combinations_confusion_matrix = np.zeros((num_activities, num_activities))
for activity_idx, activity in enumerate(ACTIVITIES):
    combinations_activity_results = combinations_results_df[combinations_results_df["activity"] == activity]
    for accuracy_idx, accuracy_column in enumerate(["a1", "a2", "a3", "a4", "a5", "a6"]):
        combinations_confusion_matrix[activity_idx, accuracy_idx] = combinations_activity_results[accuracy_column].sum()
combinations_confusion_matrix_normalized = combinations_confusion_matrix.astype("float") / combinations_confusion_matrix.sum(axis=1)[:, np.newaxis]
combinations_confusion_matrix_normalized *= 100
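    
The overall accuracy across all combinations can also be read off the raw-count matrix in one line (diagonal over total); a small cross-check sketch that should match the element-wise computation in the following cells.
    
In [ ]:
    
# Overall accuracy across all combinations: correct (diagonal) counts over all counts.
np.trace(combinations_confusion_matrix) / combinations_confusion_matrix.sum()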
    
In [73]:
    
all_samples = np.sum(combinations_confusion_matrix)
    
In [74]:
    
accurate_samples = 0
for activity_id in range(len(ACTIVITIES)):
    accurate_samples += combinations_confusion_matrix[activity_id, activity_id]
    
In [75]:
    
combinations_accuracy = accurate_samples / all_samples
combinations_accuracy
    
    Out[75]:
In [76]:
    
combinations_results_dump = {
    "accuracy": combinations_accuracy,
    "confusion_matrix": combinations_confusion_matrix_normalized,
}
    
In [77]:
    
with open("results/cs_combinations.pickle", "wb") as f:
    pickle.dump(combinations_results_dump, f)
    
In [78]:
    
# X.shape
    
In [79]:
    
# y.shape
    
In [80]:
    
# demo_train_xgbmatrix = xgb.DMatrix(X, y)
# demo_test_xgbmatrix = xgb.DMatrix(X, y)
# demo_watchlist = [(demo_train_xgbmatrix, "train"), (demo_test_xgbmatrix, "eval")]
    
In [81]:
    
# demo_results = {}
# demo_booster = xgb.train(XGB_PARAM_DEMO, demo_train_xgbmatrix, XGB_NUM_ROUNDS_DEMO, demo_watchlist, evals_result=demo_results, early_stopping_rounds=20)
    
In [82]:
    
# demo_booster.save_model("demo-xgboost.model")
    
In [83]:
    
# bst2 = xgb.Booster(model_file="demo-xgboost.model")
    
In [84]:
    
# test_dmatrix = xgb.DMatrix(X)
# y_predicted = bst2.predict(test_dmatrix)
# accuracy = metrics.accuracy_score(y, y_predicted)
    
In [85]:
    
# accuracy